library(caret)
library(lme4)
library(lmerTest)
library(ggplot2)
library(stringi)
library(gridExtra)
library(ggfortify)
library(dendextend)
library(psych)
library(kableExtra)
library(tidyverse)
library(dtplyr)
source('other_functions.R')
source('plotting_functions.R')
# set load_outputdata=TRUE if you want to skip long computations (normalization and DEA) and just load the previously saved data sets
load_outputdata=FALSE
data.list <- readRDS('input_data.rds')
dat.l <- data.list$dat.l # data in long format
# which proteins were spiked in?
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character
#TEMP -remove!
# tmp=dat.l %>% distinct(Protein) %>% pull %>% as.character
# dat.l <- dat.l %>% filter(Protein %in% c(tmp[sample(1:length(tmp), size=20)], spiked.proteins))
# specify # of varying component variants and their names
variant.names <- c('log2Intensity', 'Intensity', 'Ratio')
n.comp.variants <- length(variant.names)
scale.vec <- c('log', 'raw', 'raw') # ratios are considered raw, because they are basically mean-normalized intensities
# pick reference condition for making plots / doing DEA
referenceCondition <- '0.5'
# specify colours corresponding to biological conditions
condition.color <- tribble(
~Condition, ~Color,
"0.125", 'black',
"0.5", 'blue',
"0.667", 'green',
"1", 'red' )
# create data frame with sample info (distinct Run,Channel, Condition, Color)
sample.info <- get_sample_info(dat.l, condition.color)
channelNames <- remove_factors(unique(sample.info$Channel))
dat.unit.l <- emptyList(variant.names)
dat.unit.l$log2Intensity <- dat.l %>% mutate(response=log2(Intensity)) %>% select(-Intensity)
dat.unit.l$Intensity <- dat.l %>% rename(response=Intensity)
Calculate ratio (per PSM) with respect to average intensity within run, or in other words: each value is divided by the row mean.
# use half-wide data to compute within-run average of PSM channels corresponding to the reference Condition
refCols <- sample.info %>% distinct(Channel) %>% pull(Channel)
denom.df=dat.l %>% pivot_wider(id_cols=-one_of('Condition', 'BioReplicate'),names_from='Channel', values_from='Intensity')
denom.df$denom=apply(denom.df[,refCols], 1, function(x) mean(x, na.rm=T))
denom.df=denom.df[,c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')]
dat.unit.l$Ratio <- dat.l %>% left_join(denom.df[c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')], by=c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM')) %>% mutate(response=Intensity/denom) %>% select(-c(Intensity, denom))
# no summarization
dat.summ.l <- dat.unit.l
if (!load_outputdata){
dat.norm.l <- dat.summ.l
# fit normalization model
norm.models <- lapply(dat.summ.l, function(x) return(lmer(response ~ Mixture + Mixture:TechRepMixture + Mixture:TechRepMixture:Channel + (1|Protein) + (1|Mixture:TechRepMixture:Peptide), data=x)))
# assign normalized values
for (i in 1:n.comp.variants){ dat.norm.l[[variant.names[i]]]$response <- residuals(norm.models[[variant.names[i]]]) }
# apply the 'fix' - add the model intercept to the residuals
dat.norm.l$IntensityFix <- dat.norm.l$Intensity; dat.norm.l$IntensityFix$response <- dat.norm.l$IntensityFix$response + fixef(norm.models$Intensity)['(Intercept)']
dat.norm.l$RatioFix <- dat.norm.l$Ratio; dat.norm.l$RatioFix$response <- dat.norm.l$RatioFix$response + fixef(norm.models$Ratio)['(Intercept)']
# update variant names, #, scale
variant.names <- names(dat.norm.l)
scale.vec <- c(scale.vec, 'raw', 'raw')
n.comp.variants <- length(variant.names)
tmp <- c("log2Intensity", "Intensity", "IntensityFix", "Ratio", "RatioFix")
variant.names <- tmp
dat.norm.l <- dat.norm.l[tmp]
# for technical reasons, assign new variants to dat.summ.l
dat.summ.l$IntensityFix <- dat.summ.l$Intensity
dat.summ.l$RatioFix <- dat.summ.l$Ratio
dat.summ.l <- dat.summ.l[tmp]
}
if (!load_outputdata){
dat.nonnorm.summ.l <- lapply(dat.summ.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein', 'Peptide'), 'median'))
dat.nonnorm.summ.l <- lapply(dat.nonnorm.summ.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein'), 'median'))
dat.norm.summ.l <- lapply(dat.norm.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein', 'Peptide'), 'median'))
dat.norm.summ.l <- lapply(dat.norm.summ.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein'), 'median'))
# make data completely wide (also across runs)
## normalized data
dat.norm.summ.w2 <- lapply(dat.norm.summ.l, function(x){
dat.tmp <- pivot_wider(data=x, id_cols=Protein, names_from=Run:Channel, values_from=response, names_sep=':')
return(dat.tmp)})
## non-normalized data
dat.nonnorm.summ.w2 <- lapply(dat.nonnorm.summ.l, function(x){
dat.tmp <- pivot_wider(data=x, id_cols=Protein, names_from=Run:Channel, values_from=response, names_sep=':')
return(dat.tmp)})
}
par(mfrow=c(1,2))
for (i in 1:n.comp.variants){
boxplot_ils(dat.nonnorm.summ.l[[variant.names[i]]], paste('Raw', variant.names[i], sep='_'))
boxplot_ils(dat.norm.summ.l[[variant.names[i]]], paste('Normalized', variant.names[i], sep='_'))}
MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).
for (i in 1:n.comp.variants){
p1 <- maplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]], 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[i], paste('Raw', variant.names[i], sep='_'), spiked.proteins)
p2 <- maplot_ils(dat.norm.summ.w2[[variant.names[i]]], 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[i], paste('Normalized', variant.names[i], sep='_'), spiked.proteins)
grid.arrange(p1, p2, ncol=2)}
MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).
channels.num <- sample.info %>% filter(Condition=='1') %>% distinct(Run:Channel) %>% pull
channels.denom <- sample.info %>% filter(Condition=='0.125') %>% distinct(Run:Channel) %>% pull
for (i in 1:n.comp.variants){
p1 <- maplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]], channels.num, channels.denom, scale=scale.vec[i], paste('Raw', variant.names[i], sep='_'), spiked.proteins)
p2 <- maplot_ils(dat.norm.summ.w2[[variant.names[i]]], channels.num, channels.denom, scale=scale.vec[i], paste('Normalized', variant.names[i], sep='_'), spiked.proteins)
grid.arrange(p1, p2, ncol=2)}
par(mfrow=c(1, 2))
for (i in 1:n.comp.variants){
pca.scale=FALSE
if (variant.names[i] %in% c('Intensity', 'IntensityFix')) pca.scale=TRUE
pcaplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('Raw', variant.names[i], sep='_'), scale=pca.scale)
pcaplot_ils(dat.norm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('Normalized', variant.names[i], sep='_'), scale=pca.scale)}
par(mfrow=c(1, 2))
for (i in 1:n.comp.variants){
pca.scale=FALSE
if (variant.names[i] %in% c('Intensity', 'IntensityFix')) pca.scale=TRUE
pcaplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('Raw', variant.names[i], sep='_'), scale=pca.scale)
pcaplot_ils(dat.norm.summ.w2[[variant.names[i]]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('Normalized', variant.names[i], sep='_'), scale=pca.scale)}
HC (hierarchical clustering) plots
par(mfrow=c(1,2))
for (i in 1:n.comp.variants){
dendrogram_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('Raw', variant.names[i], sep='_'))
dendrogram_ils(dat.norm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('Normalized', variant.names[i], sep='_'))}
Below the histograms of p-values from the linear model explaining the response variable with Run as a covariate.
par(mfrow=c(1, 2))
for (i in 1:n.comp.variants){
hist_ils(run_test(dat.nonnorm.summ.l[[variant.names[i]]])$pvalue, main=paste('Raw', variant.names[i], sep='_'))
hist_ils(run_test(dat.norm.summ.l[[variant.names[i]]])$pvalue, main=paste('Normalized', variant.names[i], sep='_'))}
Intra-protein correlation modelled with 1|Run:Channel random effect.
if (!load_outputdata){
dat.dea <- emptyList(variant.names)
for(i in seq_along(dat.dea)){
dat.dea[[variant.names[i]]] <- lmm_dea(dat=dat.norm.l[[variant.names[i]]], mod.formula='response ~ Condition + (1|Run:Channel)', scale=scale.vec[i], referenceCondition)}
}
# character vectors containing logFC and p-values columns
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
significance.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)
cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm, referenceCondition)
| background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|---|---|
| not_DE | 4060 | 4 | 4015 | 13 | 4016 | 13 | 4060 | 5 | 4060 | 5 |
| DE | 3 | 15 | 48 | 6 | 48 | 6 | 4 | 14 | 4 | 14 |
| log2Intensity | Intensity | IntensityFix | Ratio | RatioFix | |
|---|---|---|---|---|---|
| Accuracy | 0.998 | 0.985 | 0.985 | 0.998 | 0.998 |
| Sensitivity | 0.789 | 0.316 | 0.316 | 0.737 | 0.737 |
| Specificity | 0.999 | 0.988 | 0.988 | 0.999 | 0.999 |
| PPV | 0.833 | 0.111 | 0.111 | 0.778 | 0.778 |
| NPV | 0.999 | 0.997 | 0.997 | 0.999 | 0.999 |
| background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|---|---|
| not_DE | 4063 | 17 | 4020 | 19 | 4021 | 19 | 4062 | 15 | 4062 | 15 |
| DE | 0 | 2 | 43 | 0 | 43 | 0 | 2 | 4 | 2 | 4 |
| log2Intensity | Intensity | IntensityFix | Ratio | RatioFix | |
|---|---|---|---|---|---|
| Accuracy | 0.996 | 0.985 | 0.985 | 0.996 | 0.996 |
| Sensitivity | 0.105 | 0.000 | 0.000 | 0.211 | 0.211 |
| Specificity | 1.000 | 0.989 | 0.989 | 1.000 | 1.000 |
| PPV | 1.000 | 0.000 | 0.000 | 0.667 | 0.667 |
| NPV | 0.996 | 0.995 | 0.995 | 0.996 | 0.996 |
| background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|---|---|
| not_DE | 4060 | 4 | 4061 | 10 | 4062 | 10 | 4059 | 4 | 4059 | 4 |
| DE | 3 | 15 | 2 | 9 | 2 | 9 | 5 | 15 | 5 | 15 |
| log2Intensity | Intensity | IntensityFix | Ratio | RatioFix | |
|---|---|---|---|---|---|
| Accuracy | 0.998 | 0.997 | 0.997 | 0.998 | 0.998 |
| Sensitivity | 0.789 | 0.474 | 0.474 | 0.789 | 0.789 |
| Specificity | 0.999 | 1.000 | 1.000 | 0.999 | 0.999 |
| PPV | 0.833 | 0.818 | 0.818 | 0.750 | 0.750 |
| NPV | 0.999 | 0.998 | 0.998 | 0.999 | 0.999 |
scatterplot_ils(dat.dea, significance.cols, 'q-values', spiked.proteins)
scatterplot_ils(dat.dea, logFC.cols, 'log2FC', spiked.proteins)
for (i in 1:n.contrasts){
volcanoplot_ils(dat.dea, i, spiked.proteins)}
Let’s see whether the spiked protein fold changes make sense
# plot theoretical value (horizontal lines) and violin per variant
violinplot_ils(lapply(dat.dea, function(x) x[spiked.proteins, logFC.cols]), referenceCondition)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dtplyr_1.0.1 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2
## [5] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2 tibble_3.0.4
## [9] tidyverse_1.3.0 kableExtra_1.3.1 psych_2.0.9 dendextend_1.14.0
## [13] ggfortify_0.4.11 gridExtra_2.3 stringi_1.5.3 lmerTest_3.1-3
## [17] lme4_1.1-25 Matrix_1.2-18 caret_6.0-86 ggplot2_3.3.2
## [21] lattice_0.20-41 rmarkdown_2.5
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.10 plyr_1.8.6 splines_4.0.3
## [5] dplR_1.7.1 BiocParallel_1.22.0 TH.data_1.0-10 digest_0.6.26
## [9] foreach_1.5.1 htmltools_0.5.0 viridis_0.5.1 ROTS_1.16.0
## [13] fansi_0.4.1 magrittr_1.5 doParallel_1.0.16 limma_3.44.3
## [17] recipes_0.1.15 modelr_0.1.8 gower_0.2.2 matrixStats_0.57.0
## [21] R.utils_2.10.1 sandwich_3.0-0 colorspace_1.4-1 signal_0.7-6
## [25] rvest_0.3.6 JGmisc_0.3 haven_2.3.1 xfun_0.18
## [29] libcoin_1.0-6 crayon_1.3.4 jsonlite_1.7.1 impute_1.62.0
## [33] zoo_1.8-8 survival_3.2-7 iterators_1.0.13 glue_1.4.2
## [37] gtable_0.3.0 ipred_0.9-9 zlibbioc_1.34.0 webshot_0.5.2
## [41] NOMAD_0.99.0 BiocGenerics_0.34.0 scales_1.1.1 mvtnorm_1.1-1
## [45] vsn_3.56.0 DBI_1.1.0 Rcpp_1.0.5 mzR_2.22.0
## [49] viridisLite_0.3.0 tmvnsim_1.0-2 CONSTANd_0.99.0 preprocessCore_1.50.0
## [53] matrixTests_0.1.9 stats4_4.0.3 lava_1.6.8.1 prodlim_2019.11.13
## [57] httr_1.4.2 modeltools_0.2-23 ellipsis_0.3.1 pkgconfig_2.0.3
## [61] XML_3.99-0.5 R.methodsS3_1.8.1 farver_2.0.3 nnet_7.3-14
## [65] dbplyr_2.0.0 utf8_1.1.4 tidyselect_1.1.0 labeling_0.4.2
## [69] rlang_0.4.8 reshape2_1.4.4 munsell_0.5.0 cellranger_1.1.0
## [73] tools_4.0.3 cli_2.1.0 generics_0.1.0 broom_0.7.2
## [77] evaluate_0.14 mzID_1.26.0 yaml_2.2.1 ModelMetrics_1.2.2.2
## [81] knitr_1.30 fs_1.5.0 coin_1.3-1 ncdf4_1.17
## [85] nlme_3.1-149 R.oo_1.24.0 xml2_1.3.2 compiler_4.0.3
## [89] rstudioapi_0.12 png_0.1-7 e1071_1.7-4 affyio_1.58.0
## [93] reprex_0.3.0 statmod_1.4.35 highr_0.8 MSnbase_2.14.2
## [97] ProtGenerics_1.20.0 nloptr_1.2.2.2 vctrs_0.3.4 pillar_1.4.6
## [101] lifecycle_0.2.0 BiocManager_1.30.10 MALDIquant_1.19.3 data.table_1.13.2
## [105] R6_2.5.0 pcaMethods_1.80.0 affy_1.66.0 IRanges_2.22.2
## [109] codetools_0.2-16 boot_1.3-25 MASS_7.3-53 assertthat_0.2.1
## [113] withr_2.3.0 mnormt_2.0.2 multcomp_1.4-14 S4Vectors_0.26.1
## [117] mgcv_1.8-33 parallel_4.0.3 hms_0.5.3 grid_4.0.3
## [121] rpart_4.1-15 timeDate_3043.102 minqa_1.2.4 class_7.3-17
## [125] pROC_1.16.2 numDeriv_2016.8-1.1 Biobase_2.48.0 lubridate_1.7.9